Lab 4

library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Continous Distributions and Expectations

1. Uniform Distribution

  1. Make a large sample and plot a histogram.

This looks like a skyline where all skyscrapers have approximately the same height. :)

x.4 <- runif(1000)
hist(x.4)

ggplot2 example:

https://dk81.github.io/dkmathstats_site/rmath-uniform-plots.html

  1. Also plot the empirical cdf.

This is close to a straight line.

plot.ecdf(x.4)
abline(a=0,b=1, col = 2)

  1. Make two uniformly distributed samples(let’s say U1 and U2) and examine the distribution of the sum (Distribution of U1 + U2).

Look at the histogram. Does it look like this is uniform?

x.5 <- runif(1000)
x.6 <- runif(1000)
x.7 = x.5 + x.6
hist(x.7)

  1. Plot the empirical cdf. Predict what it will look like before plotting.
plot.ecdf(x.7)

  1. Expected value of Uniform distribution
mean(runif(10000, min = 1, max = 6))
## [1] 3.499009

Example 1: Conditional Distribution

x <- runif(1000)# make a sample from U(0,1)
y <- x[x<0.5] # keep only observations < 0.5
z<- x[x>= 0.5] # keep only observations >= 0.5
head(x)
## [1] 0.27200895 0.07662424 0.64003796 0.14659809 0.91051626 0.18622363
head(y)
## [1] 0.27200895 0.07662424 0.14659809 0.18622363 0.44014818 0.18413936
head(z)
## [1] 0.6400380 0.9105163 0.6620075 0.6897932 0.8234334 0.8623052
plot.ecdf(x)

plot.ecdf(y)

plot.ecdf(z)

The result is essentially a straight line (uniform distribution).

# Create a sequence of numbers between -10 and 10 incrementing by 0.1.
x <- seq(-1, 2, by = .1)

# Choose the mean as 2.5 and standard deviation as 0.5.
y <- dunif(x,0,1)


plot(x,y,col="blue", main ="Density Plot of the Uniform  Distribution", type="l")

# Create a sequence of numbers between -1 and 2 incrementing by 0.2.
x <- seq(-1, 2, by = .1)
 
# Choose the mean as 2.5 and standard deviation as 2. 
y <- punif(x, 0,1)

# Plot the graph.
plot(x,y,type="l", col="red", main ="CDF of the Uniform Distribution" )

2. Exponential Distibution

  1. Density plot
# Initialize some values.

x_lower <- 0
x_upper <- 5

max_height <- max(dexp(x_lower:x_upper, rate = 1, log = FALSE)) #exponential distribution’s density function
max_height
## [1] 1
ggplot(data.frame(x = c(x_lower, x_upper)), aes(x = x)) + xlim(x_lower, x_upper) + 
    ylim(0, max_height) +
    stat_function(fun = dexp, args = list(rate = 1), geom = "area", fill = "blue", alpha = 0.25) + 
    stat_function(fun = dexp, args = list(rate = 1)) + 
    labs(x = "\n x", y = "f(x) \n", title = "Exponential Distribution Density Plot With Rate = 1 \n") + 
    theme(plot.title = element_text(hjust = 0.5), 
          axis.title.x = element_text(face="bold", colour="blue", size = 12),
          axis.title.y = element_text(face="bold", colour="blue", size = 12))

#Reference:https://dk81.github.io/dkmathstats_site/rvisual-cont-prob-dists.html
  1. Multiple Exponential Distribution Plots:
par(mfrow=c(2,2))

hist(rexp(10))
hist(rexp(100))
hist(rexp(1000))
hist(rexp(10000))

library(ggplot2)

x_lower <- 0
x_upper <- 5

max_height2 <- max(dexp(x_lower:x_upper, rate = 1, log = FALSE), dexp(x_lower:x_upper, rate = 2, log = FALSE),
        dexp(x_lower:x_upper, rate = 0.5, log = FALSE))

max_height2
## [1] 2
ggplot(data.frame(x = c(x_lower, x_upper)), aes(x = x)) + xlim(x_lower, x_upper) + 
  ylim(0, max_height2) +
  stat_function(fun = dexp, args = list(rate = 0.5), aes(colour = "0.5")) + 
  stat_function(fun = dexp, args = list(rate = 1), aes(colour = "1")) + 
  stat_function(fun = dexp, args = list(rate = 2), aes(colour = "2")) + 
  scale_color_manual("Rate", values = c("blue", "green", "red")) +
  labs(x = "\n x", y = "f(x) \n", 
       title = "Exponential Distribution Density Plots \n") + 
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="blue", size = 12),
        axis.title.y = element_text(face="bold", colour="blue", size = 12),
        legend.title = element_text(face="bold", size = 10),
        legend.position = "top")

# References:
# https://stackoverflow.com/questions/31792634/adding-legend-to-ggplot2-with-multiple-lines-on-plot
# https://stackoverflow.com/questions/19950219/using-legend-with-stat-function-in-ggplot2

Example 2: Memoryless Property of Exponential Distribution

Consider \(X ~ exponential\), say with rate = 2. What is the distribution of \(Y = X|X > 2\) ? What is the distribution of \(Y - 2\)?

x <- rexp(10000,2)
y <- x[x>2]

cdf.x <- ecdf(x)
cdf.y <- ecdf(y)
cdf.y.2 <- ecdf(y-2)

t <- seq(0,5,by = .01)
plot(t,cdf.x(t), type = 'l', lwd = 2)
lines(t,cdf.y(t), col = 2)
lines(t,cdf.y.2(t), col = 4)
legend(c("X", "Y = X|X>2", "Y-2"), fill = c(1,2,4), x = 'bottomright')

3. Normal Distribution

  1. Generating a normal random variable.

By default, rnorm() creates standard normal random variables with a mean of 0 and a standard deviation of 1. However, the mean and standard deviation can be altered using the mean and sd arguments.

y=rnorm(100)
mean(y)
## [1] -0.1555599
var(y)
## [1] 0.9829083
sqrt(var(y))
## [1] 0.9914173
 sd(y)
## [1] 0.9914173
  1. Density plot
ggplot(data.frame(x = c(-4, 4)), aes(x = x)) +
        stat_function(fun = dnorm)

# Histogram overlaid with kernel density curve
ggplot(data.frame(x = rnorm(1000)), aes(x = x)) + 
    geom_histogram(aes(y=..density..),      # Histogram with density instead of count on y-axis
                   binwidth=.5,
                   colour="black", fill="white") +
    geom_density(alpha=.2, fill="#FF6666")  # Overlay with transparent density plot

  1. CDF of the Normal Distribution
# Create a sequence of numbers between -10 and 10 incrementing by 0.2.
x <- seq(-10,10,by = .2)
 
# Choose the mean as 2.5 and standard deviation as 2. 
y <- pnorm(x, mean = 2.5, sd = 2)

# Plot the graph.
plot(x,y,type="l", col="red", main ="CDF of the Normal Distribution" )

  1. Density of the Normal Distribution
# Create a sequence of numbers between -10 and 10 increment by 0.1.
x <- seq(-10, 10, by = .1)

# Choose the mean as 2.5 and standard deviation as 0.5.
y <- dnorm(x, mean = 2.5, sd = 0.5)


plot(x,y,col="blue", main ="Density of the Normal Distribution")

  1. Quantile Function of the Normal Distribution
# Create a sequence of probability values incrementing by 0.02.
x <- seq(0, 1, by = 0.02)

# Choose the mean as 2 and standard deviation as 3.
y <- qnorm(x, mean = 2, sd = 1)


# Plot the graph.
plot(x,y,col="green", type="l", main ="Quantile Function of the Normal Distribution")

  1. Summation of normal distributions
N1=rnorm(1000)
N2=rnorm(1000,mean=50,sd=.1)
Nsum=N1+N2 #summation of normal distributions
mean(Nsum)
## [1] 50.0467
mean(N1)+mean(N2)
## [1] 50.0467
var(Nsum)
## [1] 1.010747
var(N1)+var(N2)
## [1] 1.008757
set.seed(1)    
hist_data <- data.frame(Nsum)   
colnames(hist_data) = c('x')  
gg <- ggplot(hist_data,aes(x = x, color = 'density')) +  
  geom_histogram(aes(y = ..density..), bins = 7,  fill = '#67B7D1', alpha = 0.5) +  
  geom_density(color = '#67B7D1') +  
  ylab("") + 
  xlab("")  + theme(legend.title=element_blank()) +
  ggtitle("Summation of Normal Distributions")+
  scale_color_manual(values = c('density' = '#67B7D1'))


ggplotly(gg)%>% 
  layout(plot_bgcolor='#e5ecf6',   
         xaxis = list(   
           title='N1+N2'),   
         yaxis = list(   
           title='pdf')) 

You can see that the sum follows a normal distribution. confirming it with a Q-Q plot below.

df <- data.frame(y = Nsum)
p <- ggplot(df, aes(sample = y))
p + stat_qq() + stat_qq_line()+
  ggtitle ("Q-Q plot of the Summation of two Normal Distributions")

#check your Directory file for this graph file
pdf("Figure.pdf")

df <- data.frame(y = Nsum)
p <- ggplot(df, aes(sample = y))
p + stat_qq(color='blue') + stat_qq_line()+
  ggtitle ("Q-Q plot of the Summation of two Normal Distributions")

dev.off()
## quartz_off_screen 
##                 2
  1. Correlation of random normal distributions.
x=rnorm(100)
y=rnorm(100)

plot(x,y,xlab="this is the x-axis",ylab="this is the y-axis", main="Plot of X vs Y")

Example 3: Shapiro Wilk Test

iris data set gives the measurements in centimeters of the variables sepal length, sepal width, petal length and petal width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.

my_data <- iris
head(my_data)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
  1. Density plot:

the density plot provides a visual judgment about whether the distribution is bell shaped.

library("ggpubr")
ggdensity(my_data$Sepal.Length, 
          main = "Density plot of Sepal.Length",
          xlab = "Sepal lenngth")

Looks like multi-model.

my_data_S=my_data[my_data$Species=="setosa",]
head(my_data_S)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
ggdensity(my_data_S$Sepal.Length, 
          main = "Density plot of Sepal.Length for Setosa",
          xlab = "Sepal lenngth")

  1. Q-Q plot:

Q-Q plot (or quantile-quantile plot) draws the correlation between a given sample and the normal distribution. A 45-degree reference line is also plotted.

library(ggpubr)
ggqqplot(my_data_S$Sepal.Length)

It’s also possible to use the function qqPlot() [in car package]:

library("car")
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
qqPlot(my_data_S$Sepal.Length)

## [1] 15 14

Example 4: shapiro.test()

The R function shapiro.test() can be used to perform the Shapiro-Wilk test of normality for one variable (univariate):

shapiro.test(my_data_S$Sepal.Length)
## 
##  Shapiro-Wilk normality test
## 
## data:  my_data_S$Sepal.Length
## W = 0.9777, p-value = 0.4595

From the output, the p-value > 0.05 implying that the distribution of the data are not significantly different from normal distribution (Rejection rule is if p-value < 0.05 we reject the null hypothesis ).

Here, The null hypothesis of these tests is that “sample distribution is normal”. If the test is significant, the distribution is non-normal.

In other words, we can assume the normality.

For double Checking

shapiro.test(rnorm(1000, mean = 15, sd = 2))
## 
##  Shapiro-Wilk normality test
## 
## data:  rnorm(1000, mean = 15, sd = 2)
## W = 0.99881, p-value = 0.7609

Example 5: Anderson Darling Test

library(nortest)
ad.test(rnorm(1000, mean = 5, sd = 3)) #normal
## 
##  Anderson-Darling normality test
## 
## data:  rnorm(1000, mean = 5, sd = 3)
## A = 0.41598, p-value = 0.332
ad.test(runif(1000, min = 2, max = 4)) #not normal
## 
##  Anderson-Darling normality test
## 
## data:  runif(1000, min = 2, max = 4)
## A = 10.015, p-value < 2.2e-16

Example 6: Kolmogorov-Smirnov Test

x= rnorm(1000, mean = 5, sd = 3)
y=runif(1000, min = 2, max = 4)

# Does x come from a normal(5,3) distribution
ks.test(x,"pnorm",5,3)
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  x
## D = 0.038343, p-value = 0.1057
## alternative hypothesis: two-sided
# Does y come from a normal(5,3) distribution
ks.test(y,"pnorm",5,3)
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  y
## D = 0.63124, p-value < 2.2e-16
## alternative hypothesis: two-sided
# Do x and y come from the same distribution?
ks.test(x, y)
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  x and y
## D = 0.656, p-value < 2.2e-16
## alternative hypothesis: two-sided

4. Gamma Distribution

library(ggplot2)

x_lower <- 0
x_upper <- 5

max_height2 <- max(dgamma(x_lower:x_upper, shape = 1, scale = 1), dgamma(x_lower:x_upper, shape = 2, scale = 1),
        dgamma(x_lower:x_upper, shape = 2, scale = 3))


ggplot(data.frame(x = c(x_lower, x_upper)), aes(x = x)) + xlim(x_lower, x_upper) + 
  ylim(0, max_height2) + #g.1
  stat_function(fun = dgamma, args = list(shape = 1, scale = 1), aes(colour = "g.1")) + 
  stat_function(fun = dgamma, args = list(shape = 2, scale = 1), aes(colour = "g.2")) + #g.2
  stat_function(fun = dgamma, args = list(shape = 2, scale = 3), aes(colour = "g.3")) + #g.3
  scale_color_manual("Distributions", values = c("blue", "green", "red")) +
  labs(x = "\n x", y = "f(x) \n", 
       title = "Gamma Distribution Density Plots \n") + 
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="blue", size = 12),
        axis.title.y = element_text(face="bold", colour="blue", size = 12),
        legend.title = element_text(face="bold", size = 10),
        legend.position = "top")

Eample 7:

On average, someone sends a money order once per 15 minutes. What is the probability someone sends 10 money orders in less than 3 hours?

\(X\sim Gamma(alpha=10,beta=60/15)\) \(P(X<3)?\)

alpha = 10
beta = 15 / 60 #in R we need to put 1/beta
x = 3
# exact
pgamma(q = x, shape = alpha, scale = beta)
## [1] 0.7576078
# simulated
mean(rgamma(n = 10000, shape = alpha, scale = beta) <= x)
## [1] 0.7497

5. t-distribution

  1. Find the 2.5th and 97.5th percentiles of the Student t distribution with 5 degrees of freedom.
qt(c(.025, .975), df=5)   # 5 degrees of freedom 
## [1] -2.570582  2.570582
x=rt(1000,5)
quantile(x,c(0.025,0.975))
##      2.5%     97.5% 
## -2.546692  2.605439
x=rt(100000,5)
quantile(x,c(0.025,0.975)) #much closer
##      2.5%     97.5% 
## -2.576245  2.542914
  1. Plotting the t-distribution
# Creating the vector with the x-values for dt function
x_dt <- seq(- 5, 5, by = 0.01)

# Applying the dt() function
y_dt <- dt(x_dt, df = 3)

# Plotting 
plot(y_dt, type = "l", main = "t-distribution density function example", las=1)

6. Cauchy Distribution

Make a sample from a Cauchy distribution and compute the sample mean and median.

z<- rcauchy(100)
mean(z)
## [1] -0.8805818
summary(z)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -33.2775  -0.7370   0.0000  -0.8806   0.7786  22.7616

Make many means and medians.

x.1 <- replicate(1000,mean(rcauchy(10000)))
x.2 <- replicate(1000, median(rcauchy(10000)))

summary(x.1)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -412.1741   -1.0458   -0.0313    0.0531    0.8933  314.7184
summary(x.2)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.0551049 -0.0106348 -0.0009373 -0.0008693  0.0095939  0.0483039

Maybe the sample size is too small for computing the mean ?

Plot the ecdf of a suitable sample.

plot.ecdf(rcauchy(1000))

Standard deviation of a Cauchy rv DOES NOT EXIST

boxplot(replicate(1000, sd(rcauchy(100))))

Extra Examples:

Example 8: Conditional Distribution

  • Let \(X.0 \sim U(0,3)\), \(X.1 \sim Poisson(4)\), and \(X = X.0 + X.1\)

  • Make an empirical cdf of X

  • Make an empirical cdf of \(X|X < 8\) in the same plot

  • Make an emoirical cdf of \(X.1|X<8\)

N = 10000
X.0 <- runif(N,0,3)
X.1 <- rpois(N,4)
X <- X.0 + X.1
plot.ecdf(X)

plot.ecdf(X)
X.2 <- X[X<8]
mycdf = ecdf(X.2)
t <- seq(0,10,by = .01)
lines(t,mycdf(t), col = 2)

plot.ecdf(X.1[X<8])